# 03_part_i_hierarchy.R
# Produces results from Part I: Which Categories Tend to Rank Highest?
# Summarize hierarchy among categories 
# Produces:
# - Main Figure 4 Category Ranking by PageRank Score 
# - Main Table 3. Descriptions of Most Highly Ranked Categories 
#                   across Housing Authorities
# - Table A1: Descriptions of Lower-Ranked Categories 
#               across Housing Authorities
# - Table S3: Category hierarchy rankings based on alternate 
#               measures of centrality.
# - Table S4: Comparison of PageRank scores and ranks with different 
#              amounts of down-weighting implicit ties.
# - Table S5: Category rankings according to PageRank scores, 
#                    given different values for the damping factor d 
# - Table S6: Comparison of PageRank scores and ranks, 
#          with and without weighting by voucher program size
# - all_edges.RDS: contains all implicit and explicit edges that prioritization
#                  policies draw. Used in subsequent clustering scripts. 

library(tidyverse)
library(igraph)
library(ggthemes)
library(xtable)
library(here)

# Load data on PHAs in sample
pha_pref_ref <- readRDS(here("data", "pha_pref_ref.RDS"))
# Load data on PHA characteristics
pha_all <-  readRDS(here("data", "pha_all.RDS")) %>%
  # Create variable for percent of all vouchers administered
  # by given PHA
  mutate(hud_perc_vouchers = hud_med_month_vouchers / sum(hud_med_month_vouchers) * 100)
# Load coded preference data
codes <- readRDS(here("data", "codes_expanded.RDS"))

# Create df with all PHAs in sample and their 
# preferences, including all PHAs with no preferences

# Append observations from Abt survey in which PHA
# reported having no preferences
abt_rows <- pha_pref_ref %>%
  filter(any_preferences == 0 & from_abt == TRUE) %>%
  select(pha_code)

all_prefs <- codes %>%
  select(pha_code:special_defs) %>%
  # Add rows for Abt entries
  bind_rows(abt_rows) %>%
  # Zero out rows with no preferences from Abt additions
  mutate_at(vars(unit_single:special_defs), list(~ifelse(is.na(.), 0, .))) 

##################
# Generate edges #
##################

# DRAW EXPLICIT EDGES
# (when one category is explicitly ranked more highly than another)
# Create df where each row represents an instance where a 
# category is listed at a given rank within a preference system
# The same category can appear at multiple ranks, but 
# not multiple times within a rank 
pref_ranked <- all_prefs %>%
  # Note: separating all combinations into individual categories
  reshape2::melt(id = c("pha_code", "order")) %>%
  # Retain all ranked 
  filter(value > 0 & !is.na(order)) %>%
  # Dedup within order ranks 
  #(when categories show up across multiple preferences at the same rank)
  group_by(pha_code, order, variable) %>% 
  filter(row_number() == 1) %>%
  # A category can appear across multiple ranks for a given PHA
  # Each appearance is retained 
  arrange(pha_code, order, variable) %>%
  select(-value) 

# Create df of prospective explicit tie origins 
explicit_pref_from <- pref_ranked %>%
  rename(category_from = variable,
         order_from = order) %>%
  mutate(implicit_link_base = FALSE) %>%
  ungroup()

# Create df of prospective explicit tie destinations
pref_to <- pref_ranked %>%
  rename(category_to = variable,
         order_to = order) %>%
  ungroup()

# Join these two dfs and filter 
# to only valid explicit edges
explicit_edges <- explicit_pref_from %>%
  left_join(pref_to, by = "pha_code") %>%
  # Exclude edges to self
  filter(category_from != category_to) %>%
  # Retain edges that go from lower ranked to higher ranked
  filter(order_from > order_to) %>%
  select(pha_code, implicit_link_base, category_from, category_to,
                order_from, order_to)

# DRAW IMPLICIT EDGES
# Implicit edges are drawn in two types of cases
# for a given PHA
# 1) From categories that are not part of the prioritization policy at all 
#    to categories that are prioritized in the policy
# 2) From categories that never appear alone as a preference to
#    all ranked categories 

# Identify the categories for a given PHA that appear solo in a preference

# Calculate number of categories selected per preference entry 
num_categories_selected <- rowSums(all_prefs[,-c(1:2)])

sole_category_listing <- all_prefs %>%
  ungroup() %>%
  # Join number of categories selected per preference row to each row
  mutate(num_categories_selected = num_categories_selected) %>%
  # Filter to only rows where a sole category was selected 
  filter(num_categories_selected == 1) %>%
  select(-num_categories_selected) %>%
  # Separating all combinations into individual categories 
  reshape2::melt(id = c("pha_code", "order")) %>%
  # Filter entries that are sole entries 
  filter(value == 1) %>%
  mutate(sole_category = TRUE) %>%
  # Deduplicate -- there are rare cases where
  # the same entry appears across the multiple ranks 
  # where there is a different variant (different tenant programs) 
  # of the category assigned different ranks
  group_by(pha_code, variable) %>%
  filter(row_number() == 1)

# Summary of which preference categories 
# should be included in the set of categories
# from which implicit ties should be drawn 
pref_implicit_base_summary <- all_prefs %>%
  # Filter out PHAs that didn't have any preferences
  filter(!is.na(order)) %>%
  select(-order) %>%
  group_by(pha_code) %>%
  # Flag what categories are used at least once by each PHA
  summarize_all(funs(max))  %>%
  gather(key = "category", value = "in_pref_system", -pha_code) %>%
  # Join with information about whether category is ever invoked 
  # as a single category in the preference system 
  left_join(sole_category_listing, 
            by = c("pha_code", "category" = "variable")) %>%
  select(-order, -value) %>%
  mutate(sole_category = ifelse(is.na(sole_category), 
                                FALSE, sole_category)) %>%
  # Identify categories for each PHA that should serve as the pool
  # from which an implicit edge will be drawn. 
  mutate(implicit_link_base = ifelse(in_pref_system == 0 | sole_category == FALSE, 
                                     TRUE, FALSE))

# Generate reference of lowest priority level to assign 
# to categories from which to draw implicit edges
lowest_priority <- pref_ranked %>% 
  group_by(pha_code) %>%
  summarize(lowest_priority_order = max(order) + 1)

# Identify subset of categories from which 
# implicit directed ties should originate
implicit_pref_from <- pref_implicit_base_summary %>%
  ungroup() %>%
  filter(implicit_link_base == TRUE) %>%
  left_join(lowest_priority, by = "pha_code") %>%
  # Set all implicit edge origination categories to 
  # be ranked at lowest possible priority level
  # for each PHA 
  mutate(order = lowest_priority_order) %>%
  select(-lowest_priority_order) %>%
  rename(category_from = category,
         order_from = order) 

# Draw implicit edges 
implicit_edges <- implicit_pref_from %>%
  left_join(pref_to, by = "pha_code") %>%
  # Exclude edges to self
  filter(category_from != category_to) %>%
  # Retain edges that go from lower ranked to higher ranked
  filter(order_from > order_to) %>%
  select(pha_code, implicit_link_base, category_from, category_to,
                order_from, order_to)

# Combine all edges
all_edges <- bind_rows(explicit_edges, implicit_edges) %>%
  select(-starts_with("order_"))

# Save all edges for use in clustering scripts 
saveRDS(all_edges, here("data", "intermediate", "all_edges.RDS"))

#############################
# Create adjacency matrixes #
#############################

#  Generate different weights for the edges 
all_edges_wgt <- all_edges %>%
  # Join on percent of vouchers administered
  left_join(select(pha_all, pha_code, hud_perc_vouchers), by = "pha_code") %>%
  mutate(even_wgt = 1,
         implicitdw25_wgt = ifelse(implicit_link_base == TRUE, 0.25, 1),
         implicitdw10_wgt = ifelse(implicit_link_base == TRUE, 0.1, 1),
         explicitonly_wgt = ifelse(implicit_link_base == TRUE, 0, 1),
         # Weight by percent of vouchers, implicit links down-weighted 
         # by factor of 0.25
         voucher_implicitdw25_wt = ifelse(implicit_link_base == TRUE, 
                                          0.25 * hud_perc_vouchers, 
                                          hud_perc_vouchers)
  )

# Function the generates an adjacency matrix object 
# Given a df of edges and weights associated with each edge 
get_adj <- function(edges_in, wgts_in) {
  # Sum up edge weights 
  adj <- edges_in %>%
    mutate(weight = wgts_in) %>%
    group_by(category_from, category_to) %>% 
    summarise(edge_weight = sum(weight)) %>%
    spread(category_to, edge_weight) %>%
    ungroup() %>%
    mutate_all(funs(ifelse(is.na(.), 0, .))) 
  
  rownames(adj) <- adj$category_from
  
  # Create graph object
  net_out <- adj %>%
    ungroup() %>%
    select(-category_from) %>%
    data.matrix() %>%
    graph.adjacency(mode = "directed", weighted = TRUE, diag = FALSE)
  return(net_out)
}

# Graph where all implicit and explicit edges are evenly weighed
net_nodw <- get_adj(all_edges_wgt, all_edges_wgt$even_wgt)
# Graph where all implicit edges are assigned weight factor of 0.25
net_dw25 <- get_adj(all_edges_wgt, all_edges_wgt$implicitdw25_wgt)
# Graph where all implicit edges are assigned weight factor of 0.1
net_dw10 <- get_adj(all_edges_wgt, all_edges_wgt$implicitdw10_wgt)
# Graph where all implicit edges are assigned weight of 0
net_dw0 <- get_adj(all_edges_wgt, all_edges_wgt$explicitonly_wgt)
# Graph where edges weighted by percent of vouchers administered by PHA,
# and implicit edges are further downweighted b factor of 0.25
net_vmsdw25 <- get_adj(all_edges_wgt, all_edges_wgt$voucher_implicitdw25_wt)

# Calculate PageRank centrality
# Set damping factor for main analysis
damp_fact <- 0.85
pr_main <- igraph::page_rank(net_dw25, directed = TRUE, 
                             damping = damp_fact)$vector

#####################
# Main Text Outputs #
#####################

pref_descrip <- read_csv(here("data", "pref_summary_final_122022.csv")) %>%
  mutate(category = str_replace_all(category, " ", "_"))

# Produce Main Figure 4: 
# Bar graph showing centrality based on PageRank
# with downweighting of implicit bt 0.25
# and bar coloring based on preference category
num_cats <- length(table(pref_descrip$type_new_long))
colors_bars <- colorblind_pal()(num_cats)
colors_labels <- ifelse(colors_bars == "#000000", "#FFFFFF", "#000000")

data.frame(category = names(pr_main),
           page_rank = pr_main) %>%
  left_join(pref_descrip, by = "category") %>%
  ggplot(aes(x = reorder(label, page_rank), y = page_rank, fill = type_new_long, 
             # For setting color of interior label
             color = ifelse(type_new_long %in% 
                              c("Age and household structure", 
                                "Ties to community or formal institutions"),
                            "#000000", "#FFFFFF"))) +
                              geom_bar(stat = "identity", 
                                       width = 0.85, size = 0)  +
  # For setting color of interior label
  scale_color_manual(values = colors_labels) +
  # For setting color of bars 
  scale_fill_manual(values = colors_bars) +
  guides(colour = FALSE) + 
  ylab("PageRank") + xlab("Category") + labs(fill = "Category type") + 
  coord_flip() +
  # Flag categories discussed in CFR 
  geom_text(aes(label = ifelse(cfr_designation == TRUE, "CFR", "")), 
            size=2.2, hjust = 1.2, show.legend  = FALSE) +
  annotate(geom = "text", color = "#000000", size= 3, 
           label = "CFR     Discussed in Code of Federal Regulations",
           x = 23.0, y = 0.052, hjust = 0) +
  theme_minimal() + 
  theme(legend.position = c(.75, .2)) 

ggsave(here("figs", "hierarchy.png"), 
       plot = last_plot(),
       device = "png", width = 10, height = 9,
       dpi = 300)

# Produce Main Table 3. Descriptions of Most Highly Ranked Categories 
#                       across Housing Authorities
# and Table A1: Descriptions of Lower-Ranked Categories 
#                       across Housing Authorities
# Table sorted by PageRank with descriptions for top-ranked categories
cat_desc <- data.frame(category = names(pr_main),
                       page_rank = pr_main) %>%
  left_join(pref_descrip, by = c("category")) %>%
  arrange(desc(page_rank))

print(xtable(transmute(cat_desc, 
                              Category = label, page_rank, 
                              Description = description),
             digits = c(3, 1, 3, 2)),
      include.rownames = FALSE)

#############################
# ONLINE SUPPLEMENT OUTPUTS #
#############################

# Produce Table S3: Category hierarchy rankings based on alternate 
#                   measures of centrality.
# All use network with downweight of .25 for implicit edges 
alt_measures_tab <- data.frame(category = names(pr_main),
                               in_degree = graph.strength(net_dw25, mode = "in"), 
                               out_degree = graph.strength(net_dw25, mode = "out"),
                               page_rank = pr_main) %>%
  mutate(degree_diff = in_degree - out_degree, 
         prop_in = in_degree / (in_degree + out_degree),
         prop_out = 1 - prop_in) %>%
  arrange(desc(page_rank)) %>%
  select(category, prop_in,
         degree_diff, 
         page_rank) %>%
  mutate(rank_prop_in = dense_rank(-prop_in),
         rank_degreediff = dense_rank(-degree_diff),
         rank_pagerank = dense_rank(-page_rank)) %>%
  left_join(pref_descrip, by = "category") %>%
  select(label, rank_pagerank, page_rank, rank_prop_in, prop_in, 
         rank_degreediff, degree_diff)

print(xtable(alt_measures_tab,
             digits = c(2, 2, 2, 3, 2, 2, 2, 1)), include.rownames  = FALSE) 

# Produce Table S4: Comparison of PageRank scores and ranks with different 
# amounts of down-weighting implicit ties.
# Sorted by ranking in main specification 
alt_dw_tab <- data.frame(category = names(pr_main),
                         page_rank_dw0 = igraph::page_rank(net_dw0, directed = TRUE, 
                                                           damping = damp_fact)$vector,
                         page_rank_dw10 = igraph::page_rank(net_dw10, directed = TRUE, 
                                                            damping = damp_fact)$vector,
                         page_rank_dw25 = pr_main,
                         page_rank_nodw = igraph::page_rank(net_nodw, directed = TRUE, 
                                                            damping = damp_fact)$vector) %>% 
  arrange(desc(page_rank_dw25)) %>%
  mutate(rank_dw0 = dense_rank(-page_rank_dw0),
         rank_dw10 = dense_rank(-page_rank_dw10),
         rank_dw25 = dense_rank(-page_rank_dw25),
         rank_nodw = dense_rank(-page_rank_nodw)) %>%
  left_join(pref_descrip, by = "category") %>%
  select(label, rank_dw0, page_rank_dw0, rank_dw10, page_rank_dw10, 
         rank_dw25, page_rank_dw25, rank_nodw, page_rank_nodw)


print(xtable(alt_dw_tab,
             digits = c(1,1,3,3,1,3,1,3,1,3)), include.rownames  = FALSE) 

# Produce Table S5: Category rankings according to PageRank scores, 
#                    given different values for the damping factor d 
damping_facts <- seq(.15, .95, .1)
pagerank_damp_mat <- sapply(damping_facts, 
                            function(d) igraph::page_rank(net_dw25, directed = TRUE, 
                                                          damping = d)$vector)
colnames(pagerank_damp_mat) <- paste0("pagerank_", damping_facts * 100)

page_rank_damp_tab <- data.frame(category = rownames(pagerank_damp_mat), 
                                 pagerank_damp_mat) %>%
  gather(key = "damping_set", value = "score", -category) %>%
  group_by(damping_set) %>%
  arrange(desc(score)) %>%
  mutate(rank = row_number()) %>%
  ungroup() %>%
  mutate(damping_set = as.numeric(str_remove(damping_set, "pagerank_"))) %>%
  gather(key = "key", value = "value", -category, -damping_set) %>%
  mutate(variable = paste0(key, "_", damping_set)) %>%
  select(-key, -damping_set) %>%
  spread(variable, value) %>%
  arrange(rank_85) %>%
  left_join(pref_descrip, by = "category")

print(xtable(select(page_rank_damp_tab, label, starts_with("rank")),
             digits = rep(0,11)), include.rownames  = FALSE) 

# Produce Table S6: Comparison of PageRank scores and ranks, 
#          with and without weighting by voucher program size
# Compares results when all PHAs are weighed evenly (main specification) vs 
# when edges are weighed by the percent of vouchers a given PHA administers 
vms_wgt_tab <- data.frame(category = names(pr_main),
                          page_rank_dw25 = pr_main,
                          page_rank_vmsdw25 = 
                            igraph::page_rank(net_vmsdw25, directed = TRUE, 
                                              damping = damp_fact)$vector) %>% 
  arrange(desc(page_rank_dw25)) %>%
  mutate(rank_dw25 = dense_rank(-page_rank_dw25),
         rank_vmsdw25  = dense_rank(-page_rank_vmsdw25)) %>%
  left_join(pref_descrip, by = "category") %>%
  select(label, rank_dw25, page_rank_dw25,
         rank_vmsdw25, page_rank_vmsdw25) %>%
  mutate(rank_diff = rank_dw25 - rank_vmsdw25)

print(xtable(vms_wgt_tab,
             digits = c(1,1,3,3,1,3,1)), include.rownames  = FALSE) 
